This is the procedure that newly observed data will follow.
In addition, at the end we will build a single function that accumulates the whole procedure so it can be stored in a Pipeline. This will be extremely useful when the project is put into production.
# first, all the libraries, grouped in blocks
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import pickle
from random import seed
data = pd.read_csv("../data/Untransformed_test_data_personal_prob.csv",low_memory=False)
data.set_index(['Unnamed: 0'], inplace=True)
data.index.names = ['']
data.head()
|  | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_SAFE | P_USER | C_SEV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 87310 | 1999 | 03 | 7 | 09 | 02 | 34 | 02 | 1 | 2 | 02 | 01 | 1993 | 02 | F | 03 | 13 | 02 | 2 | 2 |
| 2333390 | 2004 | 09 | 2 | 12 | 02 | 01 | 2 | 1 | 01 | 01 | 1989 | 01 | M | 43 | 11 | 02 | 1 | 2 | |
| 1485833 | 2002 | 08 | 2 | 22 | 03 | 21 | UU | 1 | 1 | 01 | 01 | 1986 | 01 | F | 25 | 11 | 02 | 1 | 2 |
| 1155103 | 2001 | 10 | 4 | 18 | 02 | 21 | 01 | 1 | 1 | 02 | 01 | 1988 | 01 | M | 46 | 11 | UU | 1 | 2 |
| 3970807 | 2009 | 02 | 3 | 06 | 02 | 35 | UU | 1 | Q | 02 | 01 | 2004 | 01 | M | 45 | 11 | 02 | 1 | 2 |
print('Dataset shape:', data.shape)
print('Dataset shape without duplicates:', data.drop_duplicates().shape)
Dataset shape: (1463204, 19)
Dataset shape without duplicates: (1463204, 19)
Duplicate records must not be kept, since they could distort our analysis.
data = data.drop_duplicates()
data[['C_SEV']].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1463204 entries, 87310 to 3804043
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   C_SEV   1463204 non-null  int64
dtypes: int64(1)
memory usage: 22.3 MB
objetivo = pd.DataFrame(data['C_SEV'].value_counts())
objetivo.columns = ['Accidents with Fatalities']
objetivo['Percentage'] = data['C_SEV'].value_counts()*100/len(data)
objetivo.reset_index(inplace=True)
objetivo.rename(columns={'index':'Values'}, inplace=True)
#objetivo
#objetivo.reset_index(inplace=True)
fig = px.bar(objetivo, x="Values", y=['Percentage', 'Accidents with Fatalities'])
fig.show();
y = pd.get_dummies(data['C_SEV'])
y.columns = ['Fatalities', 'No Fatalities']
data['C_SEV'] = y['Fatalities']
y = y['Fatalities']
y.value_counts()
0    1438551
1      24653
Name: Fatalities, dtype: int64
data.isnull().sum().sort_values(ascending=False)
C_VEHS    1
C_YEAR    0
V_TYPE    0
P_USER    0
P_SAFE    0
P_PSN     0
P_AGE     0
P_SEX     0
P_ID      0
V_YEAR    0
V_ID      0
C_MNTH    0
C_RSUR    0
C_WTHR    0
C_RCFG    0
C_CONF    0
C_HOUR    0
C_WDAY    0
C_SEV     0
dtype: int64
The variable 'C_VEHS' has its own code for missing values, so we fill these missings with the corresponding value.
data.C_VEHS.fillna('UU', inplace=True)
data.isnull().sum().sort_values(ascending=False)
C_YEAR    0
V_TYPE    0
P_USER    0
P_SAFE    0
P_PSN     0
P_AGE     0
P_SEX     0
P_ID      0
V_YEAR    0
V_ID      0
C_MNTH    0
C_RSUR    0
C_WTHR    0
C_RCFG    0
C_CONF    0
C_VEHS    0
C_HOUR    0
C_WDAY    0
C_SEV     0
dtype: int64
data.columns
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
'C_WTHR', 'C_RSUR', 'V_ID', 'V_TYPE', 'V_YEAR', 'P_ID', 'P_SEX',
'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER', 'C_SEV'],
dtype='object')
I will generate dummy variables because, when these variables are converted to numeric, both the N and the U observations would be turned into NA; they should not receive the same treatment, given the difference in the distribution of the target variable in each particular case.
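As a quick sanity check of that claim, one could compare the fatality rate of the 'NN' and 'UU' codes in one of these columns before they are coerced to NaN; a minimal sketch (using P_AGE as an example, run before the conversion below):
# Hypothetical check: average of the 0/1 target (C_SEV) per unknown code in P_AGE
for code in ['NN', 'UU']:
    mask = data['P_AGE'] == code
    if mask.any():
        print(code, mask.sum(), round(data.loc[mask, 'C_SEV'].mean(), 4))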
numerical_vars = ['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'P_ID', 'V_ID', 'V_YEAR', 'P_AGE']
unknown = ['N', 'NN', 'NNN', 'NNNN', 'U', 'UU', 'UUU', 'UUUU']
for i in numerical_vars:
    for j in unknown:
        data[i + '_' + j] = np.where(data[i] == j, 1, 0)
for i in numerical_vars:
    data[i] = pd.to_numeric(data[i], errors='coerce', downcast='integer')
data = data.loc[:, (data != 0).any(axis=0)]
data.shape
(1463204, 29)
categorical_vals = [c for c in data if c not in numerical_vars][1:10]
categorical_vals
['C_CONF', 'C_RCFG', 'C_WTHR', 'C_RSUR', 'V_TYPE', 'P_SEX', 'P_PSN', 'P_SAFE', 'P_USER']
data['V_TYPE'] = np.where(data['V_TYPE'].isin(['05', '06', '07', '08', '18', '19', '20', '21']), 'truck', data['V_TYPE'])
data['V_TYPE'] = np.where(data['V_TYPE'].isin(['09', '10', '11']), 'bus', data['V_TYPE'])
data['V_TYPE'] = np.where(data['V_TYPE'].isin(['14', '16', '17', '22']), 'bike', data['V_TYPE'])
data['V_TYPE'] = np.where(data['V_TYPE'].isin(['01', '23']), 'car', data['V_TYPE'])
data = data[~data['V_TYPE'].isin(['QQ', 'UU'])]
data['V_TYPE'].value_counts()
car      1205938
truck     103543
bike       64798
NN         56812
bus        18405
Name: V_TYPE, dtype: int64
data['P_PSN'] = np.where(data['P_PSN'].isin(['11', '12', '13']), 'front', data['P_PSN'])
data['P_PSN'] = np.where(data['P_PSN'].isin(['21', '22', '23']), 'middle', data['P_PSN'])
data['P_PSN'] = np.where(data['P_PSN'].isin(['31', '32', '33']), 'back', data['P_PSN'])
data['P_PSN'] = np.where(data['P_PSN'].isin(['96']), 'occupant', data['P_PSN'])
data['P_PSN'] = np.where(data['P_PSN'].isin(['98', '97']), 'unsecure', data['P_PSN'])
data['P_PSN'] = np.where(data['P_PSN'].isin(['99']), 'pedestrian', data['P_PSN'])
data['P_PSN'] = np.where(data['P_PSN'].isin(['UU', 'NN', 'QQ']), 'other', data['P_PSN'])
data['P_PSN'].value_counts()/len(data)
front         0.835499
middle        0.096529
pedestrian    0.035900
other         0.016560
occupant      0.007459
back          0.006911
unsecure      0.001143
Name: P_PSN, dtype: float64
data['P_SAFE'] = np.where(data['P_SAFE'].isin(['02', '09', '11', '12']), 'safe', data['P_SAFE'])
data['P_SAFE'] = np.where(data['P_SAFE'].isin(['01', '10', '13']), 'unsafe', data['P_SAFE'])
data['P_SAFE'] = np.where(data['P_SAFE'].isin(['QQ', 'NN', 'UU']), 'other', data['P_SAFE'])
data['P_SAFE'].value_counts()/len(data)
safe      0.732837
other     0.208354
unsafe    0.058809
Name: P_SAFE, dtype: float64
data['P_USER'] = np.where(data['P_USER'].isin(['1', '2']), 'vehicle', data['P_USER'])
data['P_USER'] = np.where(data['P_USER'].isin(['3']), 'pedestrian', data['P_USER'])
data['P_USER'] = np.where(data['P_USER'].isin(['4']), 'bicyclist', data['P_USER'])
data['P_USER'] = np.where(data['P_USER'].isin(['5']), 'motorcyclist', data['P_USER'])
data['P_USER'] = np.where(data['P_USER'].isin(['U']), 'other', data['P_USER'])
data['P_SEX'].value_counts()/len(data)
M    0.540434
F    0.418898
U    0.038145
N    0.002523
Name: P_SEX, dtype: float64
data = data[~data['P_SEX'].isin(['N'])]
data['P_SEX'].value_counts()
M    783357
F    607191
U     55291
Name: P_SEX, dtype: int64
Rather than the vehicles' year of manufacture, what matters is how old each vehicle was at the date of the accident.
data['V_ANT'] = data['C_YEAR'] - data['V_YEAR']
data.drop(columns=['V_YEAR'], inplace=True)
#data['V_ANT'].value_counts()/len(data)
I observe negative values for vehicle age; these records will be filtered out of the dataset.
data = data[(data['V_ANT'] > 0)|(data['V_ANT'].isna())]
#data['V_ANT'].value_counts()
data['V_ANT'] = np.where((data['V_ANT'] > 25), 25, data['V_ANT'])
data['C_RCFG'] = np.where((~data['C_RCFG'].isin(['02','01','UU','03','QQ','05'])), 'OT', data['C_RCFG'])
#data['C_RCFG'].value_counts()/len(data)
data = data[~data['C_WTHR'].isin(['Q'])]
data['C_WTHR'] = np.where((data['C_WTHR'].isin(['5','6','7'])), 'O', data['C_WTHR'])
#data['C_WTHR'].value_counts()/len(data)
We will group observations 9, 8, 7 and 6 into an additional category called 'O'.
data['C_RSUR'] = np.where((data['C_RSUR'].isin(['9','8','7', '6'])), 'O', data['C_RSUR'])
#data['C_RSUR'].value_counts()/len(data)
data.drop(columns=['C_MNTH_UU', 'C_WDAY_U', 'C_HOUR_UU', 'P_ID_NN', 'P_ID_UU',
'V_ID_UU', 'P_AGE_NN'], inplace=True)
data.dropna(subset=['V_ID', 'P_ID'], inplace=True)
columns_with_na = ['C_MNTH', 'C_WDAY', 'C_HOUR', 'P_AGE', 'V_ANT']
data_to_fillna = data[columns_with_na]
model_list = ['SimpleImputer_trained', 'TargetEncoder_model']
unpickeled_models = []
for model in list(model_list):
    # use after the models are pickled
    imputer = open('../models/' + model + '.pickle', "rb")
    unpickeled_models.append(pickle.load(imputer))
    imputer.close()
data_to_fillna = unpickeled_models[0].transform(data_to_fillna)
data[columns_with_na] = data_to_fillna
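The imputer and the encoder are only loaded here; as an assumption (the actual fitting happens in the training notebook), the pickled SimpleImputer would have been produced along these lines, with the imputation strategy being purely illustrative:
def fit_and_save_imputer(train_df, columns, path='../models/SimpleImputer_trained.pickle'):
    # Sketch only: fit a SimpleImputer on the training set and pickle it for reuse here.
    from sklearn.impute import SimpleImputer
    import pickle
    imp = SimpleImputer(strategy='median')  # assumed strategy, not confirmed by this notebook
    imp.fit(train_df[columns])
    with open(path, 'wb') as f:
        pickle.dump(imp, f)
    return imp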
We transform the hours, the days of the week and the months into two variables each, taking the sine and the cosine of the original value, in order to capture their cyclical behaviour: after hour 24 a new cycle starts again from 0 to 24. The same applies to the other variables.
data['C_HOUR_SIN'] = np.sin(2 * np.pi * data['C_HOUR']/24)
data['C_HOUR_COS'] = np.cos(2 * np.pi * data['C_HOUR']/24)
data['C_WDAY_SIN'] = np.sin(2 * np.pi * data['C_WDAY']/7)
data['C_WDAY_COS'] = np.cos(2 * np.pi * data['C_WDAY']/7)
data['C_MNTH_SIN'] = np.sin(2 * np.pi * data['C_MNTH']/12)
data['C_MNTH_COS'] = np.cos(2 * np.pi * data['C_MNTH']/12)
data.drop(columns=['C_HOUR', 'C_WDAY', 'C_MNTH'], inplace=True)
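A quick check that the encoding behaves as intended: in the (sin, cos) plane, hour 23 ends up close to hour 1 but far from hour 12, which a plain numeric hour would not capture.
# Chord distances on the unit circle: 2*sin(pi*dh/24) for hours dh apart
def hour_point(h):
    return np.array([np.sin(2 * np.pi * h / 24), np.cos(2 * np.pi * h / 24)])

print(np.linalg.norm(hour_point(23) - hour_point(1)))   # ~0.52 (adjacent across midnight)
print(np.linalg.norm(hour_point(23) - hour_point(12)))  # ~1.98 (almost opposite)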
not_treated_columns = [c for c in data.columns if c not in ['C_HOUR_SIN', 'C_HOUR_COS', 'C_MNTH_SIN',
'C_MNTH_COS', 'C_WDAY_SIN', 'C_WDAY_COS',
'C_YEAR', 'C_AGE', 'V_ANT',
'C_SEV', 'V_YEAR_NNNN', 'V_YEAR_UUUU', 'P_AGE_UU',
'V_ID', 'P_ID', 'P_AGE']]
data[not_treated_columns].head()
|  | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | V_TYPE | P_SEX | P_PSN | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|
| 87310 | 02 | 34 | 02 | 1 | 2 | car | F | front | safe | vehicle |
| 2333390 | 02 | 01 | 2 | 1 | car | M | front | safe | vehicle | |
| 1485833 | 03 | 21 | UU | 1 | 1 | car | F | front | safe | vehicle |
| 1155103 | 02 | 21 | 01 | 1 | 1 | car | M | front | other | vehicle |
| 3970807 | 02 | 35 | UU | 1 | Q | car | M | front | safe | vehicle |
data.drop(columns= ['V_ID', 'P_ID'], inplace=True)
data = unpickeled_models[1].transform(data)
data.head()
|  | C_YEAR | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | V_TYPE | P_SEX | P_AGE | P_PSN | ... | V_YEAR_NNNN | V_YEAR_UUUU | P_AGE_UU | V_ANT | C_HOUR_SIN | C_HOUR_COS | C_WDAY_SIN | C_WDAY_COS | C_MNTH_SIN | C_MNTH_COS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 87310 | 1999 | 0.012907 | 0.012065 | 0.009720 | 0.015883 | 0.014277 | 0.014307 | 0.012568 | 3.0 | 0.015287 | ... | 0 | 0 | 0 | 6.0 | 7.071068e-01 | -7.071068e-01 | -2.449294e-16 | 1.000000 | 1.000000 | 6.123234e-17 |
| 2333390 | 2004 | 0.012907 | 0.018530 | 0.027714 | 0.020966 | 0.016904 | 0.014307 | 0.020431 | 43.0 | 0.015287 | ... | 0 | 0 | 0 | 15.0 | 1.224647e-16 | -1.000000e+00 | 9.749279e-01 | -0.222521 | -1.000000 | -1.836970e-16 |
| 1485833 | 2002 | 0.013298 | 0.003711 | 0.011817 | 0.015883 | 0.016904 | 0.014307 | 0.012568 | 25.0 | 0.015287 | ... | 0 | 0 | 0 | 16.0 | -5.000000e-01 | 8.660254e-01 | 9.749279e-01 | -0.222521 | -0.866025 | -5.000000e-01 |
| 1155103 | 2001 | 0.012907 | 0.003711 | 0.027714 | 0.015883 | 0.016904 | 0.014307 | 0.020431 | 46.0 | 0.015287 | ... | 0 | 0 | 0 | 13.0 | -1.000000e+00 | -1.836970e-16 | -4.338837e-01 | -0.900969 | -0.866025 | 5.000000e-01 |
| 3970807 | 2009 | 0.012907 | 0.013763 | 0.011817 | 0.015883 | 0.019240 | 0.014307 | 0.020431 | 45.0 | 0.015287 | ... | 0 | 0 | 0 | 5.0 | 1.000000e+00 | 6.123234e-17 | 4.338837e-01 | -0.900969 | 0.866025 | 5.000000e-01 |
5 rows × 23 columns
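The pickled TargetEncoder_model replaces each remaining categorical level with a target mean learned on the training set (typically smoothed), which is why columns such as C_CONF or C_RCFG now contain small values close to the overall fatality rate instead of their original codes. A minimal, un-smoothed sketch of the idea (an illustrative helper, not the saved model):
def naive_target_encode(train_df, column, target='C_SEV'):
    # Illustrative only: map each category to its mean fatality rate in the training data
    means = train_df.groupby(column)[target].mean()
    return train_df[column].map(means)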
data.to_csv(r'../data/test_data_personal_prob.csv')
def data_preprocess(data):
    """Accumulate the whole preprocessing procedure so new observed data can be transformed in one call."""
    # first, all the libraries, grouped in blocks
    import numpy as np
    import pandas as pd
    import warnings
    warnings.filterwarnings("ignore")
    import pickle
    from random import seed
    data = data.drop_duplicates()
    y = pd.get_dummies(data['C_SEV'])
    y.columns = ['Fatalities', 'No Fatalities']
    data['C_SEV'] = y['Fatalities']
    data.C_VEHS.fillna('UU', inplace=True)
    numerical_vars = ['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'P_ID', 'V_ID', 'V_YEAR', 'P_AGE']
    unknown = ['N', 'NN', 'NNN', 'NNNN', 'U', 'UU', 'UUU', 'UUUU']
    for i in numerical_vars:
        for j in unknown:
            data[i + '_' + j] = np.where(data[i] == j, 1, 0)
    for i in numerical_vars:
        data[i] = pd.to_numeric(data[i], errors='coerce', downcast='integer')
    data = data.loc[:, (data != 0).any(axis=0)]
    categorical_vals = [c for c in data if c not in numerical_vars][1:10]
    data['V_TYPE'] = np.where(data['V_TYPE'].isin(['05', '06', '07', '08', '18', '19', '20', '21']), 'truck', data['V_TYPE'])
    data['V_TYPE'] = np.where(data['V_TYPE'].isin(['09', '10', '11']), 'bus', data['V_TYPE'])
    data['V_TYPE'] = np.where(data['V_TYPE'].isin(['14', '16', '17', '22']), 'bike', data['V_TYPE'])
    data['V_TYPE'] = np.where(data['V_TYPE'].isin(['01', '23']), 'car', data['V_TYPE'])
    data = data[~data['V_TYPE'].isin(['QQ', 'UU'])]
    data['P_PSN'] = np.where(data['P_PSN'].isin(['11', '12', '13']), 'front', data['P_PSN'])
    data['P_PSN'] = np.where(data['P_PSN'].isin(['21', '22', '23']), 'middle', data['P_PSN'])
    data['P_PSN'] = np.where(data['P_PSN'].isin(['31', '32', '33']), 'back', data['P_PSN'])
    data['P_PSN'] = np.where(data['P_PSN'].isin(['96']), 'occupant', data['P_PSN'])
    data['P_PSN'] = np.where(data['P_PSN'].isin(['98', '97']), 'unsecure', data['P_PSN'])
    data['P_PSN'] = np.where(data['P_PSN'].isin(['99']), 'pedestrian', data['P_PSN'])
    data['P_PSN'] = np.where(data['P_PSN'].isin(['UU', 'NN', 'QQ']), 'other', data['P_PSN'])
    data['P_SAFE'] = np.where(data['P_SAFE'].isin(['02', '09', '11', '12']), 'safe', data['P_SAFE'])
    data['P_SAFE'] = np.where(data['P_SAFE'].isin(['01', '10', '13']), 'unsafe', data['P_SAFE'])
    data['P_SAFE'] = np.where(data['P_SAFE'].isin(['QQ', 'NN', 'UU']), 'other', data['P_SAFE'])
    data['P_USER'] = np.where(data['P_USER'].isin(['1', '2']), 'vehicle', data['P_USER'])
    data['P_USER'] = np.where(data['P_USER'].isin(['3']), 'pedestrian', data['P_USER'])
    data['P_USER'] = np.where(data['P_USER'].isin(['4']), 'bicyclist', data['P_USER'])
    data['P_USER'] = np.where(data['P_USER'].isin(['5']), 'motorcyclist', data['P_USER'])
    data['P_USER'] = np.where(data['P_USER'].isin(['U']), 'other', data['P_USER'])
    data = data[~data['P_SEX'].isin(['N'])]
    data['V_ANT'] = data['C_YEAR'] - data['V_YEAR']
    data.drop(columns=['V_YEAR'], inplace=True)
    data = data[(data['V_ANT'] > 0) | (data['V_ANT'].isna())]
    data['V_ANT'] = np.where((data['V_ANT'] > 25), 25, data['V_ANT'])
    data['C_RCFG'] = np.where((~data['C_RCFG'].isin(['02', '01', 'UU', '03', 'QQ', '05'])), 'OT', data['C_RCFG'])
    data = data[~data['C_WTHR'].isin(['Q'])]
    data['C_WTHR'] = np.where((data['C_WTHR'].isin(['5', '6', '7'])), 'O', data['C_WTHR'])
    data['C_RSUR'] = np.where((data['C_RSUR'].isin(['9', '8', '7', '6'])), 'O', data['C_RSUR'])
    data.drop(columns=['C_MNTH_UU', 'C_WDAY_U', 'C_HOUR_UU', 'P_ID_NN', 'P_ID_UU', 'V_ID_UU', 'P_AGE_NN'], inplace=True)
    data.dropna(subset=['V_ID', 'P_ID'], inplace=True)
    columns_with_na = ['C_MNTH', 'C_WDAY', 'C_HOUR', 'P_AGE', 'V_ANT']
    data_to_fillna = data[columns_with_na]
    model_list = ['SimpleImputer_trained', 'TargetEncoder_model']
    unpickeled_models = []
    for model in list(model_list):
        # use after the models are pickled
        imputer = open('../models/' + model + '.pickle', "rb")
        unpickeled_models.append(pickle.load(imputer))
        imputer.close()
    data_to_fillna = unpickeled_models[0].transform(data_to_fillna)
    data[columns_with_na] = data_to_fillna
    data['C_HOUR_SIN'] = np.sin(2 * np.pi * data['C_HOUR']/24)
    data['C_HOUR_COS'] = np.cos(2 * np.pi * data['C_HOUR']/24)
    data['C_WDAY_SIN'] = np.sin(2 * np.pi * data['C_WDAY']/7)
    data['C_WDAY_COS'] = np.cos(2 * np.pi * data['C_WDAY']/7)
    data['C_MNTH_SIN'] = np.sin(2 * np.pi * data['C_MNTH']/12)
    data['C_MNTH_COS'] = np.cos(2 * np.pi * data['C_MNTH']/12)
    data.drop(columns=['C_HOUR', 'C_WDAY', 'C_MNTH'], inplace=True)
    not_treated_columns = [c for c in data.columns if c not in ['C_HOUR_SIN', 'C_HOUR_COS', 'C_MNTH_SIN',
                                                                'C_MNTH_COS', 'C_WDAY_SIN', 'C_WDAY_COS', 'C_YEAR', 'C_AGE', 'V_ANT',
                                                                'C_SEV', 'V_YEAR_NNNN', 'V_YEAR_UUUU', 'P_AGE_UU', 'V_ID', 'P_ID', 'P_AGE']]
    data.drop(columns=['V_ID', 'P_ID'], inplace=True)
    data = unpickeled_models[1].transform(data)
    return data
test = pd.read_csv("../data/Untransformed_test_data_personal_prob.csv",low_memory=False)
test.set_index(['Unnamed: 0'], inplace=True)
test.index.names = ['']
test2 = data_preprocess(test)
data.equals(test2)
True
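Finally, as stated at the beginning, for production the accumulated procedure can be chained with the trained classifier so that new raw data is transformed and scored in one step; a minimal sketch using scikit-learn's FunctionTransformer (the classifier file name is a placeholder, not a file produced in this notebook):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def build_scoring_pipeline(model_path='../models/final_model.pickle'):
    # 'final_model.pickle' is hypothetical: whatever classifier was trained elsewhere
    with open(model_path, 'rb') as f:
        clf = pickle.load(f)
    # data_preprocess keeps C_SEV, so drop it before handing the features to the model
    preprocess = FunctionTransformer(lambda df: data_preprocess(df).drop(columns=['C_SEV']))
    return Pipeline([('preprocess', preprocess), ('model', clf)])

# Usage (new_raw_data: a DataFrame with the same raw columns as the test file above):
# predictions = build_scoring_pipeline().predict(new_raw_data)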